# Mount Google Drive so the input spreadsheet and model files under
# /content/drive/ are readable (Colab-only; prompts for authorization).
from google.colab import drive
drive.mount('/content/drive/')
Load required libraries
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D, SpatialDropout1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords # Remove stop words
import string # Remove punctuation
from wordcloud import WordCloud
nltk.download('wordnet')
Load data file into a dataframe
# Path to the ticket dataset on the mounted Drive.
file_path = '/content/drive/My Drive/input_data.xlsx'
# Columns appear to be "Short description", "Description", "Caller",
# "Assignment group" -- TODO confirm against the spreadsheet.
data_df = pd.read_excel(file_path)
data_df.head()
EDA
Shape of the dataset
data_df.shape  # (n_rows, n_columns) of the raw dataset
Describe the dataset
data_df.describe()  # summary statistics of the columns
Check data-types of columns
data_df.dtypes
Check for null values and replace them with stop words.
data_df.isnull().sum()  # per-column count of missing values
# Locate remaining nulls (kept for interactive inspection in the notebook).
null_indices = np.where(pd.isnull(data_df))
# Replace nulls with a stop word so downstream cleaning drops them.
# Assign the filled Series back instead of fillna(..., inplace=True) on a
# column slice: per-column inplace fillna is chained assignment, hits the
# SettingWithCopy pitfall, and is deprecated in pandas >= 2.1.
data_df["Short description"] = data_df["Short description"].fillna("the")
data_df["Description"] = data_df["Description"].fillna("the")
data_df.isnull().any()
Check for assignment group wise count of tickets
# Ticket counts per assignment group, top 9 plotted.
group_count = data_df['Assignment group'].value_counts()
# Pass x/y as keywords: positional data arguments to barplot were removed
# in seaborn 0.12, so the original call raises TypeError there.
sns.barplot(x=group_count.index[:9], y=group_count.values[:9], alpha=0.8)
plt.title('Category wise number of tickets')
plt.ylabel('Number of tickets', fontsize=12)
plt.xlabel('Assignment groups', fontsize=12)
plt.show()
46% of tickets are assigned to assignment group "GRP_0".
Check for caller wise count of tickets
# Ticket counts per caller, top 9 plotted.
group_count = data_df['Caller'].value_counts()
# Keyword x/y: positional data args to barplot were removed in seaborn 0.12.
sns.barplot(x=group_count.index[:9], y=group_count.values[:9], alpha=0.8)
plt.title('User wise number of tickets')
plt.ylabel('Number of tickets', fontsize=12)
plt.xlabel('Users', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()
The highest number of tickets created by a single user is 810 (approximately 9% of all tickets).
Group 0 has a large number of tickets. Here is Group 0 analysis.
# Drill into GRP_0 (the dominant group): bucket its tickets into coarse
# types by keyword search on the short description.
# .copy() avoids SettingWithCopyWarning on the later column assignment.
df2 = data_df[data_df['Assignment group'] == 'GRP_0'].copy()
# Full option path: bare "max_rows" is rejected by current pandas.
pd.set_option("display.max_rows", None)
df2.dropna(axis=0, inplace=True)
df2.reset_index(drop=True, inplace=True)

# Compile the keyword patterns once instead of on every loop iteration.
search_list1 = ['log', 'login', 'account', 'username', 'password', 'join', 'id', 'access', 'internet']
search_list2 = ['software', 'server', 'vpn', 'microsoft', 'skype', 'gmail', 'outlook', 'email']
pattern1 = re.compile('|'.join(search_list1), re.IGNORECASE)  # access/credential issues -> 'A'
pattern2 = re.compile('|'.join(search_list2), re.IGNORECASE)  # software/service issues -> 'B'

group_type = []
for i in range(len(df2)):
    if pattern1.search(df2['Short description'][i]):
        group_type.append('A')
    elif pattern2.search(df2['Short description'][i]):
        group_type.append('B')
    else:
        group_type.append('C')
df2['Group_Type'] = group_type
# Keyword arg: positional data arguments were removed in seaborn 0.12.
sns.countplot(x=df2['Group_Type'])
Find duplicates in data
# Rows that duplicate an earlier row across all columns.
data_df[data_df.duplicated()]
data_df[data_df.duplicated()].count()  # per-column count of duplicated rows
There are 84 duplicate tickets having all 4 columns same.
Word cloud on Description column
# Concatenate every ticket description into one blob and render a word cloud.
desc = " ".join(data_df.Description)
wc_desc = WordCloud(background_color='white', max_words=200, width=400,
                    height=400, random_state=10).generate(desc)
plt.figure(figsize=(10, 10))
plt.imshow(wc_desc)
Word cloud on Short description column
# Word cloud over the short descriptions.
sh_desc = " ".join(data_df['Short description'])
wc_sh_desc = WordCloud(background_color='white', max_words=200, width=400,
                       height=400, random_state=10).generate(sh_desc)
plt.figure(figsize=(10, 10))
plt.imshow(wc_sh_desc)
Group wise word cloud
# One word cloud per assignment group, tiled on a 9x9 subplot grid.
plt.figure(figsize=(20, 20))
for index, group in enumerate(data_df['Assignment group'].unique()):
    s = str(group)
    # Join the actual descriptions; str(Series) -- as the original did --
    # renders the truncated pandas repr (index numbers, "Name: Description,
    # dtype: object") instead of the ticket text.
    text = " ".join(data_df[data_df['Assignment group'] == s].Description.astype(str))
    wc = WordCloud(background_color='white', max_words=200, width=400,
                   height=400, random_state=10).generate(text)
    plt.subplot(9, 9, index + 1)
    plt.imshow(wc)
    plt.title(s)
Data Preprocessing
Remove duplicates
# Drop the fully-duplicated tickets found above and renumber rows.
data_df.drop_duplicates(inplace=True)
data_df.reset_index(drop=True,inplace=True)
Remove "Reported by emailid" words
#replacing email ids using caller column
#replacing email ids using caller column
# Strip "<first>.<last>@gmail.com" (built from the Caller column) out of each
# Description; rows with an empty/NaN description pass through unchanged.
df1 = []
for i in data_df.index:
    str1 = data_df.iloc[i].Description
    if not (str1 == '' or pd.isnull(str1)):
        name_parts = data_df.iloc[i].Caller.split()
        # Guard: a single-word Caller would raise IndexError on [1].
        if len(name_parts) >= 2:
            tp = name_parts[0] + '.' + name_parts[1] + '@gmail.com'
            # re.escape: the address contains '.', a regex wildcard; unescaped
            # it could match unintended substrings.
            if re.search(re.escape(tp), str1):
                str1 = str1.replace(tp, '')
    df1.append(str1)
tp = pd.DataFrame(df1, columns=["Description"])
#replacing 'received from:' string
#replacing 'received from:' string
# Remove the boilerplate 'received from:' marker from the partially-cleaned
# descriptions in `tp` (the output of the email-stripping pass above).
df2 = []
testString = 'received from:'
for i in tp.index:
    # Bug fix: read from `tp` (already email-stripped), not `data_df`; the
    # original re-read the raw description and so discarded the previous
    # pass whenever the marker string was absent.
    str1 = tp.iloc[i].Description
    if not (str1 == '' or pd.isnull(str1)):
        if re.search(testString, str1):
            str1 = str1.replace(testString, '')
    df2.append(str1)
tp2 = pd.DataFrame(df2, columns=["Description"])
#Remove all remaining email ids
#Remove all remaining email ids
# Vectorized string ops assigned back into the frame; the original
# `tp2.iloc[i].Description = ...` wrote to a temporary row copy (chained
# assignment), so the cleanup was silently lost.
tp2["Description"] = (
    tp2["Description"]
    .str.replace('\n', ' ', regex=False)
    .str.replace('\r', '', regex=False)
    .str.replace(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", '', regex=True)
)
# Word cloud of descriptions after email / boilerplate removal, letting
# WordCloud itself filter English stop words.
STOPWORDS = set(stopwords.words('english'))
esc2 = " ".join(tp2.Description)
wc_desc = WordCloud(background_color='white', stopwords=STOPWORDS, max_words=200,
                    width=400, height=400, random_state=10).generate(esc2)
plt.figure(figsize=(10, 10))
plt.imshow(wc_desc)
# Write the cleaned descriptions back into the working DataFrame.
# The original `copy_df = data_df` was an alias (not a copy), and it assigned
# `df2` -- the list from the 'received from:' pass -- dropping the final
# email-regex cleanup held in tp2. Use tp2's column values directly.
data_df["Description"] = tp2["Description"].values
data_df.head()
Data normalization. Acronyms handling
# Expansion table for English contractions. NOTE(review): the name
# "contradictions" is a misnomer for "contractions", kept because clean_text
# below references it. Several values list alternatives separated by " / "
# (e.g. "he had / he would"); clean_text stems and inserts that whole string
# verbatim -- confirm this is intended.
contradictions = {
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}
print(stopwords.words('english'))  # show the NLTK English stop-word list
The method below cleans the incoming text. The text is first converted to lower case, then contractions (the `contradictions` table) are expanded and Porter stemming is applied. After that, punctuation, special characters and stop words are removed. We also remove some common noise words found within this dataset.
# Characters replaced with a space during cleaning.
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
# Anything that is not a digit or lowercase letter (input is lowercased first).
BAD_SYMBOLS_RE = re.compile('[^0-9a-z]')
# Boilerplate legal/footer sentences stripped verbatim from ticket text.
disclaimers = ['select the following link to view the disclaimer in an alternate language.', 'this communication (including any accompanying documents) is intended only for the sole use of the person(s) to whom it is addressed and may contain information that is privileged,confidential and exempt from disclosure. any unauthorised reading,dissemination ,distribution,duplication of this communication by someone other than the intended recipient is strictly prohibited. if your receipt of this communication is in error,please notify the sender and destrtgoy the original communication immediately','please do not print this email unless it is absolutely necessary. spread environmental awareness','this mailbox is not monitored.please call support at the phone number in this communication for any questions you may have']
def clean_text(text, remove_stopwords=True):
    """Normalize a ticket text for modelling.

    Lower-cases, strips known disclaimer boilerplate and digits, expands
    contractions (via the module-level `contradictions` table), applies
    Porter stemming, replaces punctuation/special characters with spaces,
    and optionally removes NLTK English stop words plus dataset-specific
    noise words.

    Parameters
    ----------
    text : str
        Raw ticket text.
    remove_stopwords : bool, default True
        Whether to drop English and custom stop words.

    Returns
    -------
    str
        The cleaned, space-joined text (a single space for empty input).
    """
    porter = PorterStemmer()
    text = text.lower()
    # Remove disclaimer boilerplate verbatim.
    for d in disclaimers:
        text = text.replace(d, '')
    # Remove digits.
    text = re.sub('[0-9]', ' ', text)
    words = text.split()   # original wrapped this in a dead `if True:` block
    if not words:
        return ' '
    # Expand contractions and stem; blank out single-char punctuation tokens.
    new_text = []
    for word in words:
        if word in contradictions:
            new_text.append(porter.stem(contradictions[word]))
        elif word in string.punctuation:
            new_text.append("")
        else:
            new_text.append(porter.stem(word))
    text = " ".join(new_text)
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # map listed symbols to spaces
    text = BAD_SYMBOLS_RE.sub(' ', text)       # drop everything non [0-9a-z]
    if remove_stopwords:
        # Build the stop-word set once (the original built it twice and
        # re-filtered against it a second, redundant time in the final join).
        stops = set(stopwords.words("english"))
        # Dataset-specific noise words observed in the tickets.
        newstop_words = ['yes','no','na','mii','hii','hello','hi','help','please','receiv','received','dear','company','from','sent','to','subject','mailto','email','unabl','need','pleas','issu','com','compani','kennametal.com','http','widia','regards','see','phone','thanks','thankyou','bitte']
        text = " ".join(w for w in text.split()
                        if w not in stops and w not in newstop_words)
    return text
# Clean both free-text columns with the normalizer above.
data_df['Description'] = data_df['Description'].apply(clean_text)
data_df['Short description'] = data_df['Short description'].apply(clean_text)
data_df.isnull().any()
data_df.head()
Word cloud on the Description column after data cleansing
# Word cloud of the fully-cleaned descriptions.
STOPWORDS = set(stopwords.words('english'))
esc3 = " ".join(data_df.Description)
wc_desc = WordCloud(background_color='white', stopwords=STOPWORDS, max_words=200,
                    width=400, height=400, random_state=10).generate(esc3)
plt.figure(figsize=(10, 10))
plt.imshow(wc_desc)
Word cloud on the Short description column after data cleansing
# Word cloud of the fully-cleaned short descriptions.
sh_desc2 = " ".join(data_df['Short description'])
wc_sh_desc = WordCloud(background_color='white', stopwords=STOPWORDS, max_words=200,
                       width=400, height=400, random_state=10).generate(sh_desc2)
plt.figure(figsize=(10, 10))
plt.imshow(wc_sh_desc)
Group wise word cloud
# Per-group word clouds on the cleaned text, then persist the cleaned frame.
plt.figure(figsize=(20, 20))
for index, group in enumerate(data_df['Assignment group'].unique()):
    s = str(group)
    # Join the actual descriptions; str(Series) would feed the truncated
    # pandas repr (index numbers, dtype footer) to WordCloud.
    text = " ".join(data_df[data_df['Assignment group'] == s].Description.astype(str))
    wc = WordCloud(background_color='white', stopwords=STOPWORDS, max_words=200,
                   width=400, height=400, random_state=10).generate(text)
    plt.subplot(9, 9, index + 1)
    plt.imshow(wc)
    plt.title(s)
data_df.to_csv("all_74_groups_multilingual.csv")
Dealing with Languages other than English
pip install fasttext
# fastText language-identification model (compressed lid.176 release).
limodel = '/content/drive/My Drive/lid.176.ftz'
import fasttext
lid_model = fasttext.load_model(limodel)
def predict_lang(model, texts):
    """Return the model's top-1 (k=1) language prediction for *texts*."""
    return model.predict(texts, k=1)
# Scan every ticket with fastText; collect confident (>0.5) non-English hits.
rows = []
for index in data_df.index:
    prediction = predict_lang(lid_model, data_df.iloc[index]["Description"])
    label = prediction[0][0].split("__label__")[1]
    confidence = prediction[1][0]
    if label != "en" and confidence > 0.50:
        print(index)
        rows.append({'index': index, 'label': label,
                     "confidence": confidence,
                     "text": data_df.iloc[index]["Description"]})
# Build the frame once at the end: DataFrame.append was removed in pandas 2.0
# and was O(n^2) when called per row; this also keeps 'index' integer-typed.
otherLanguagesDf = pd.DataFrame(rows, columns=['index', 'label', 'confidence', 'text'])
plt.title("Contribution of other languages in the dataset")
sns.countplot(x=otherLanguagesDf['label'])  # keyword arg for seaborn >= 0.12
otherLanguagesDf['label'].value_counts()
#Total other languages are 7% of whole dataset. We can remove these entries to make our dataset comprise only of English language tickets.
# Cast the labels to int before dropping: a frame grown with DataFrame.append
# stores the 'index' column as float, and float labels do not match
# data_df's integer index in drop().
indexesToRemove = otherLanguagesDf["index"].astype(int)
data_df = data_df.drop(indexesToRemove)
data_df.shape
data_df.isnull().any()
data_df.to_csv("all_74_groups_only_english.csv")
Model building
Merge all columns into a single column
# Concatenate the first three (text) columns into a single feature column,
# then drop the originals, keeping only the target and the merged text.
text_cols = data_df.columns[0:3]
data_df['MergedColumn'] = data_df[text_cols].astype(str).apply(' '.join, axis=1)
data_df = data_df.drop(['Short description', 'Description', 'Caller'], axis=1)
data_df.head()
Find the maximum length of the Merged column
# Find the longest merged text in characters.
# Renamed from `max`, which shadowed the builtin max(); the unused
# maxindex / lengthsdf locals are dropped.
max_len = 1
for ind in data_df.index:
    length = len(data_df['MergedColumn'][ind])
    if length > max_len:
        max_len = length
print(max_len)

# Tokenize the merged column.
# NOTE(review): num_words is set to the maximum *character* length of a
# document, which only coincidentally bounds the vocabulary -- confirm intent.
max_features = max_len
maxlen = 200          # pad/truncate every sequence to 200 tokens
embedding_size = 200  # must match the 200-d GloVe vectors loaded below
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data_df['MergedColumn'])
X = tokenizer.texts_to_sequences(data_df['MergedColumn'])
X = pad_sequences(X, maxlen=maxlen)
# One-hot encode the target groups (the original also computed an unused
# np.asarray of the raw labels that was immediately overwritten).
y = pd.get_dummies(data_df['Assignment group']).values
Split data into train and test datasets
# 80/20 train/test split with a fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Extract the GloVe embedding file
# Location of the zipped GloVe embeddings on Drive.
project_path = '/content/drive/My Drive/'
glove_file = project_path + "Copy_of_glove.6B.zip"
#Extract Glove embedding zip file
# Extracts into the current working directory, producing glove.6B.*.txt files.
from zipfile import ZipFile
with ZipFile(glove_file, 'r') as z:
    z.extractall()
Embed each word
# Parse the 200-d GloVe vectors into {word: np.ndarray(float32)}.
EMBEDDING_FILE = './glove.6B.200d.txt'
embeddings = {}
# Context manager closes the file (the original iterated an open() handle it
# never closed) and each line is split only once instead of twice.
with open(EMBEDDING_FILE, encoding='utf-8') as f:
    for line in f:
        word, _, vec = line.partition(' ')
        embeddings[word] = np.asarray(vec.split(), dtype='float32')
Find the num_words and create embedding matrix
# Build the embedding matrix: row i holds the GloVe vector of the token whose
# tokenizer index is i. Row 0 and out-of-vocabulary tokens stay all-zero.
num_words = 1 + len(tokenizer.word_index)
embedding_matrix = np.zeros((num_words, 200))
for token, idx in tokenizer.word_index.items():
    vector = embeddings.get(token)
    if vector is not None:
        embedding_matrix[idx] = vector
len(embeddings.values())
num_words
Create model skeleton
# Classifier: GloVe-initialized embedding -> spatial dropout -> LSTM ->
# softmax over the 74 assignment groups.
model = Sequential()
# NOTE(review): trainable is not set to False, so the GloVe weights are
# fine-tuned during training -- confirm this is intended.
model.add(Embedding(num_words, embedding_size, weights = [embedding_matrix]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(74, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
Train the model
# Train for 30 epochs, holding out 10% of the training split for validation.
epochs = 30
batch_size = 64
history = model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1)
Test the model on test set
# Final evaluation on the held-out 20% test split.
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))
Plot the accuracy and loss functions
# Learning curves. The second curve comes from model.fit's validation_split,
# so it is labelled 'validation' (the original mislabelled it 'test').
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.legend()
plt.show()
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show()
Conclusion - The accuracy of the model on the test set is 67.07%. The loss and accuracy curves suggest that this is due to the imbalanced nature of the dataset.